import pickle, os, time, itertools, csv
import pandas as pd

# Load the family_id -> family mapping from the pickle in temp/.
# NOTE(review): pickle.load can execute arbitrary code; this presumably reads
# a file written by an earlier step of the same pipeline -- confirm it is
# never pointed at untrusted input.
with open("temp/family_class_second.pickle", "rb") as pickle_file:
	family_class = pickle.load(pickle_file)

# Extract the epodocs that appear on more than one row (i.e. that are listed
# under two or more family ids), sorted so equal epodocs are adjacent.
df = pd.read_csv("temp/family_class_second.csv").sort_values('epodoc')
appears_more_than_once = df.duplicated('epodoc', keep=False)
dup = df[appears_more_than_once].reset_index(drop=True)
print(dup)

# Complete list of groups of families which need to be corrected.
# `dup` is sorted by epodoc, so rows sharing an epodoc are adjacent. Each run
# of equal values in the first column yields one frozenset holding the values
# of the second column (presumably the family ids -- confirm against the CSV
# layout) that claim that epodoc.
failure = set()
fault = set()
previous_epodoc = None
for epodoc, family_id in zip(dup.iloc[:, 0], dup.iloc[:, 1]):
	# A change of epodoc closes the current run; store it and start a new one.
	if fault and epodoc != previous_epodoc:
		failure.add(frozenset(fault))
		fault = set()
	fault.add(family_id)
	previous_epodoc = epodoc
# Flush the final run: it has no following row to trigger the store above.
# The guard also keeps an empty `dup` from producing an empty group.
if fault:
	failure.add(frozenset(fault))
print(failure)
print(len(failure))

# Correction
# Every element of `failure` is a group of family ids that share at least one
# epodoc, so all families in the group must become a single family. Groups
# can overlap (A conflicts with B, B conflicts with C), therefore the ids are
# first partitioned into connected merge groups with a small disjoint-set,
# and each merge group is then replaced by the union of its families.
# A family that is a subset of another is covered by the same rule: the union
# simply equals the larger family.
parent = {}


def _find_root(family_id):
	"""Return the representative id of the merge group containing family_id."""
	root = family_id
	# Unknown ids are registered as their own root on first sight.
	while parent.setdefault(root, root) != root:
		root = parent[root]
	# Path compression: point every id on the walked path straight at the root.
	while parent[family_id] != root:
		next_id = parent[family_id]
		parent[family_id] = root
		family_id = next_id
	return root


for error in failure:
	roots = [_find_root(family_id) for family_id in error]
	# Link every group touched by this conflict to the first one.
	for other_root in roots[1:]:
		parent[other_root] = roots[0]

merge_groups = {}
for family_id in list(parent):
	merge_groups.setdefault(_find_root(family_id), []).append(family_id)

# Apply the corrections to family_class.
# New keys start above the current maximum so that a merged family can never
# overwrite an unrelated entry (len(family_class) + 1 may already be in use
# once entries have been deleted).
next_family_id = max(family_class, default=0) + 1
for member_ids in merge_groups.values():
	if len(member_ids) < 2:
		# A single id conflicting only with itself needs no correction.
		continue
	# pop() raises KeyError for an id missing from family_class, just as the
	# direct lookup family_class[...] would.
	merged_family = frozenset().union(
		*(family_class.pop(member_id) for member_id in member_ids)
	)
	family_class[next_family_id] = merged_family
	next_family_id += 1


# family_class_third
# Drop identical families (set() keeps one copy of each) and number the
# remaining ones consecutively starting from 1.
family_class_third = dict(enumerate(set(family_class.values()), start=1))


# Save as a CSV file: one (epodoc, id) row per member of every family, with
# surrounding whitespace stripped from the epodoc.
with open("temp/family_class_third.csv", "w", encoding="utf-8", newline="") as csv_file:
	writer = csv.writer(csv_file)
	writer.writerow(["epodoc", "id"])
	writer.writerows(
		[member.strip(), family_id]
		for family_id, members in family_class_third.items()
		for member in members
	)

# Save as a pickle file (the mapping itself, epodocs not stripped).
with open("temp/family_class_third.pickle", "wb") as pickle_file:
	pickle.dump(family_class_third, pickle_file)

# final check
# Re-read the CSV that was just written and print every epodoc that still
# appears on more than one row.
df = pd.read_csv("temp/family_class_third.csv").sort_values('epodoc')
still_duplicated = df.duplicated('epodoc', keep=False)
dup = df[still_duplicated].reset_index(drop=True)
print(dup)

# Save the sorted table as family_class_final.dta (Stata format).
df.to_stata("result/family_class_final.dta", write_index=False)